{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# for Python 2: use print only as a function\n",
    "from __future__ import print_function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# read file into pandas using a relative path\n",
    "path = 'data/sms.tsv'\n",
    "sms = pd.read_table(path, header=None, names=['label', 'message'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5572, 2)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the shape\n",
    "sms.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>message</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ham</td>\n",
       "      <td>Go until jurong point, crazy.. Available only ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ham</td>\n",
       "      <td>Ok lar... Joking wif u oni...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>spam</td>\n",
       "      <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ham</td>\n",
       "      <td>U dun say so early hor... U c already then say...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ham</td>\n",
       "      <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>spam</td>\n",
       "      <td>FreeMsg Hey there darling it's been 3 week's n...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>ham</td>\n",
       "      <td>Even my brother is not like to speak with me. ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ham</td>\n",
       "      <td>As per your request 'Melle Melle (Oru Minnamin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>spam</td>\n",
       "      <td>WINNER!! As a valued network customer you have...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>spam</td>\n",
       "      <td>Had your mobile 11 months or more? U R entitle...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label                                            message\n",
       "0   ham  Go until jurong point, crazy.. Available only ...\n",
       "1   ham                      Ok lar... Joking wif u oni...\n",
       "2  spam  Free entry in 2 a wkly comp to win FA Cup fina...\n",
       "3   ham  U dun say so early hor... U c already then say...\n",
       "4   ham  Nah I don't think he goes to usf, he lives aro...\n",
       "5  spam  FreeMsg Hey there darling it's been 3 week's n...\n",
       "6   ham  Even my brother is not like to speak with me. ...\n",
       "7   ham  As per your request 'Melle Melle (Oru Minnamin...\n",
       "8  spam  WINNER!! As a valued network customer you have...\n",
       "9  spam  Had your mobile 11 months or more? U R entitle..."
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the first 10 rows\n",
    "sms.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ham     4825\n",
       "spam     747\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the class distribution\n",
    "sms.label.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# convert label to a numerical variable\n",
    "sms['label_num'] = sms.label.map({'ham':0, 'spam':1})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>message</th>\n",
       "      <th>label_num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ham</td>\n",
       "      <td>Go until jurong point, crazy.. Available only ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ham</td>\n",
       "      <td>Ok lar... Joking wif u oni...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>spam</td>\n",
       "      <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ham</td>\n",
       "      <td>U dun say so early hor... U c already then say...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ham</td>\n",
       "      <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>spam</td>\n",
       "      <td>FreeMsg Hey there darling it's been 3 week's n...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>ham</td>\n",
       "      <td>Even my brother is not like to speak with me. ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>ham</td>\n",
       "      <td>As per your request 'Melle Melle (Oru Minnamin...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>spam</td>\n",
       "      <td>WINNER!! As a valued network customer you have...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>spam</td>\n",
       "      <td>Had your mobile 11 months or more? U R entitle...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label                                            message  label_num\n",
       "0   ham  Go until jurong point, crazy.. Available only ...          0\n",
       "1   ham                      Ok lar... Joking wif u oni...          0\n",
       "2  spam  Free entry in 2 a wkly comp to win FA Cup fina...          1\n",
       "3   ham  U dun say so early hor... U c already then say...          0\n",
       "4   ham  Nah I don't think he goes to usf, he lives aro...          0\n",
       "5  spam  FreeMsg Hey there darling it's been 3 week's n...          1\n",
       "6   ham  Even my brother is not like to speak with me. ...          0\n",
       "7   ham  As per your request 'Melle Melle (Oru Minnamin...          0\n",
       "8  spam  WINNER!! As a valued network customer you have...          1\n",
       "9  spam  Had your mobile 11 months or more? U R entitle...          1"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check that the conversion worked\n",
    "sms.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(5572,)\n",
      "(5572,)\n"
     ]
    }
   ],
   "source": [
    "# how to define X and y (from the SMS data) for use with COUNTVECTORIZER\n",
    "X = sms.message\n",
    "y = sms.label_num\n",
    "print(X.shape)\n",
    "print(y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(4179,)\n",
      "(1393,)\n",
      "(4179,)\n",
      "(1393,)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "# split X and y into training and testing sets\n",
    "from sklearn.cross_validation import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n",
    "print(X_train.shape)\n",
    "print(X_test.shape)\n",
    "print(y_train.shape)\n",
    "print(y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import and instantiate CountVectorizer (with the default parameters)\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "# instantiate the vectorizer\n",
    "vect = CountVectorizer()\n",
    "# learn training data vocabulary, then use it to create a document-term matrix\n",
    "vect.fit(X_train)\n",
    "X_train_dtm = vect.transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# equivalently: combine fit and transform into a single step\n",
    "X_train_dtm = vect.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<4179x7456 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 55209 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the document-term matrix\n",
    "X_train_dtm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1393x7456 sparse matrix of type '<type 'numpy.int64'>'\n",
       "\twith 17604 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# transform testing data (using fitted vocabulary) into a document-term matrix\n",
    "X_test_dtm = vect.transform(X_test)\n",
    "X_test_dtm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn import linear_model\n",
    "clf = linear_model.LogisticRegression(C=1e5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 32 ms, sys: 0 ns, total: 32 ms\n",
      "Wall time: 32.2 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=100000.0, class_weight=None, dual=False,\n",
       "          fit_intercept=True, intercept_scaling=1, max_iter=100,\n",
       "          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n",
       "          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# train the model using X_train_dtm (timing it with an IPython \"magic command\")\n",
    "%time clf.fit(X_train_dtm, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# make class predictions for X_test_dtm\n",
    "y_pred_class = clf.predict(X_test_dtm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.98851399856424982"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate accuracy of class predictions\n",
    "from sklearn import metrics\n",
    "metrics.accuracy_score(y_test, y_pred_class)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1205,    3],\n",
       "       [  13,  172]])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print the confusion matrix\n",
    "metrics.confusion_matrix(y_test, y_pred_class)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2340    Cheers for the message Zogtorius. Ive been st...\n",
       "4009    Forgot you were working today! Wanna chat, but...\n",
       "1497    I'm always on yahoo messenger now. Just send t...\n",
       "Name: message, dtype: object"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print message text for the false positives (ham incorrectly classified as spam)\n",
    "X_test[y_test < y_pred_class]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1777                    Call FREEPHONE 0800 542 0578 now!\n",
       "763     Urgent Ur £500 guaranteed award is still uncla...\n",
       "3132    LookAtMe!: Thanks for your purchase of a video...\n",
       "1875    Would you like to see my XXX pics they are so ...\n",
       "1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...\n",
       "4298    thesmszone.com lets you send free anonymous an...\n",
       "4394    RECPT 1/3. You have ordered a Ringtone. Your o...\n",
       "4949    Hi this is Amy, we will be sending you a free ...\n",
       "761     Romantic Paris. 2 nights, 2 flights from £79 B...\n",
       "19      England v Macedonia - dont miss the goals/team...\n",
       "2821    INTERFLORA - It's not too late to order Inter...\n",
       "2247    Hi ya babe x u 4goten bout me?' scammers getti...\n",
       "4514    Money i have won wining number 946 wot do i do...\n",
       "Name: message, dtype: object"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print message text for the false negatives (spam incorrectly classified as ham)\n",
    "X_test[y_test > y_pred_class]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323.\""
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# example false negative\n",
    "X_test[3132]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  9.90605877e-07,   4.03318205e-09,   1.38284799e-07, ...,\n",
       "         6.48403564e-06,   1.00000000e+00,   3.77161208e-09])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate predicted probabilities for X_test_dtm (poorly calibrated)\n",
    "y_pred_prob = clf.predict_proba(X_test_dtm)[:, 1]\n",
    "y_pred_prob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.99326114193663873"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate AUC\n",
    "metrics.roc_auc_score(y_test, y_pred_prob)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    ""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2.0
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}